import time
import myPymysql

'''
爬猫眼网站TOP100的电影数据：
    http://maoyan.com/board/4?offset=0
    http://maoyan.com/board/4?offset=10
    http://maoyan.com/board/4?offset=20
    http://maoyan.com/board/4?offset=90
'''
import requests
import random
import re
import logging

# Get a named logger instance for this crawler.
logger = logging.getLogger("maoyan")
# Log record format: timestamp, level, message.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
# File handler: records go to maoyan.txt.
# NOTE(review): original comment mentioned a terminal handler too, but none is added.
file_handler = logging.FileHandler("maoyan.txt")
file_handler.setFormatter(formatter)

# Default level: INFO and above are recorded.
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)

def get_one_page(url):
    """
    Send an HTTP GET request and return the response body.

    :param url: page URL to fetch
    :return: response text on HTTP 200, otherwise None
    """
    # Pretend to be a normal browser so the site does not reject the crawler.
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
    try:
        # A timeout keeps the crawler from hanging forever on a dead connection.
        response = requests.get(url, headers=ua_headers, timeout=10)
    except requests.RequestException as err:
        # Network failures should not abort the whole crawl loop; the caller
        # already treats None as "this page failed".
        logger.error("request failed for %s: %s", url, err)
        return None
    if response.status_code == 200:  # ok
        return response.text
    return None

def write_to_sql(item):
    """
    Insert one movie record into the database.

    :param item: dict with keys 'title', 'actor' and 'time'
    """
    dbhelper = myPymysql.DBHelper()
    # INSERT IGNORE skips rows that violate a unique key, so re-running the
    # crawler does not insert duplicates. Values are passed as parameters,
    # never concatenated into the SQL string.
    sql = "INSERT IGNORE INTO testdb.maoyan(title,actor,time) VALUES (%s,%s,%s);"
    params = (item['title'], item['actor'], item['time'])
    result = dbhelper.execute(sql, params)
    if result:
        print("插入成功,电影名：" + item['title'])
    else:
        # Lazy %-formatting: the original `logger.error("params: ", params)`
        # had no placeholder and made the logging module raise at render time.
        logger.error("execute: %s", sql)
        logger.error("params: %s", params)
        logger.error("插入失败")
        print("插入失败")

def parse_one_page(html):
    """
    Extract the wanted fields from one ranking page of HTML:
    movie title, starring actors and release time.

    :param html: raw HTML text of one board page
    :return: generator of dicts with keys 'title', 'actor', 'time'
    """
    # Raw string avoids invalid-escape warnings on \s in modern Python;
    # [\s\S] is used instead of . so the match can span newlines.
    pattern = re.compile(
        r'<p class="name">.*?title="([\s\S]*?)"[\s\S]*?<p class="star">([\s\S]*?)</p>[\s\S]*?<p class="releasetime">([\s\S]*?)</p>')

    # yield keeps the function state between items, so callers can stream
    # results without materializing the whole list.
    for title, actor, release_time in pattern.findall(html):
        yield {
            'title': title.strip(),
            'actor': actor.strip(),
            'time': release_time.strip(),
        }


import matplotlib.pyplot as plt


def analysisCounry():
    """
    Aggregate the stored movies by country/region and draw a pie chart.

    The region is LIKE-matched inside the `time` column, which holds the
    release time together with the region name.
    """
    dbhelper = myPymysql.DBHelper()
    # fetchCount returns a one-element row; index [0] is the count.
    Total = dbhelper.fetchCount("SELECT count(*) FROM `testdb`.`maoyan`;")
    Am = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%美国%";')
    Ch = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%中国%";')
    Jp = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%日本%";')
    LMNY = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%罗马尼亚%";')
    HG = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%韩国%";')
    FG = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%法国%";')
    YDL = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%意大利%";')
    # Everything not matched by a known region goes into "other".
    Other = Total[0] - Am[0] - Ch[0] - Jp[0] - LMNY[0] - HG[0] - FG[0] - YDL[0]
    # BUG FIX: sizes must follow the same order as labels. The original
    # listed FG (法国) before HG (韩国), swapping the Korea and France slices.
    sizes = Am[0], Ch[0], Jp[0], LMNY[0], HG[0], FG[0], YDL[0], Other
    labels = '美国', 'China', 'Japan', '罗马尼亚', '韩国', '法国', '意大利', '其他'
    colors = 'b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'
    explode = (0,) * len(labels)  # no slice pulled out of the pie
    # Draw the pie chart of the per-country counts.
    plt.pie(sizes, explode=explode, labels=labels,
            colors=colors, autopct="%1.1f%%", shadow=True)
    plt.show()


def CrawlMovieInfo():
    """
    Crawl the TOP100 board: movie title, starring actors and release time.

    Pages live at offset 0, 10, ..., 90 (ten movies per page); each page is
    downloaded, parsed and written to the database.
    """
    print('开始抓取电影名，主演，上映时间...')
    for offset in range(0, 91, 10):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        print('当前url：' + url)
        # Download the current page; get_one_page returns None on any failure.
        html = get_one_page(url)
        if html is None:
            print("此网页爬取失败：" + url)
            continue
        # Stream the parsed records straight into the database.
        for item in parse_one_page(html):
            write_to_sql(item)
        # Wait a random 5-7 seconds between pages to stay polite and reduce
        # the chance of being rate-limited. (The old comment said 1-3s but
        # the code has always slept 5-7s.)
        time.sleep(random.randint(5, 7))


if __name__ == "__main__":
    # Crawl the pages. The data has already been stored in the database,
    # so the call is commented out; uncomment to re-crawl.
    # CrawlMovieInfo()

    # Draw the per-country pie chart from the stored data.
    analysisCounry()
